In [1]:
#Library imports
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from numpy import nan
In [2]:
#Data inspection of dataframe
df_iris = pd.read_csv("iris.csv") #Read the clean data file

# Returns the first 3 rows, default 5
print("\n")  #Blank row
print("First 3 rows in dataframe:\n"
,df_iris.iloc[:, 0:6].head(3))

# Checking data types of columns
print("\n")  #Blank row
print("Data structure:\n")
df_iris.iloc[:, 0:6].info()

# Return data descriptive statistics(for numeric columns values)
print("\n")  #Blank row
print("Data description:\n")
df_iris.iloc[:, 0:6].describe()

# Return data values
#df_iris.values

num_df = df_iris.iloc[:,1:5] #dataframe with only numeric colums, for correlation etc.
# Heatmap
#.1e = scientific notation with 1 decimal point (standard form)
#.2f = 2 decimal places
#.3g = 3 significant figures
#.4% = percentage with 4 decimal places

plt.figure(figsize=(10, 8))
sns.heatmap(num_df.corr(), annot=True, cmap='coolwarm', vmin=-1, vmax=1, fmt='.3f')
plt.title("Heatmap of Correlation Matrix with Fixed Axis")
plt.show()

# Histograms
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots(
rows=(num_df.shape[1]//3)+1,
cols=3,
subplot_titles=num_df.columns
)

for i, col in enumerate(num_df.columns):
    fig.add_trace(go.Histogram(x=num_df[col], name=col), row=(i//3)+1, col=(i%3)+1)

fig.update_layout(height=1000, showlegend=False)

fig.show()

First 3 rows in dataframe:
    Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa


Data structure:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


Data description:

No description has been provided for this image
In [3]:
# Data import - Introducing unclean data
df_iris = pd.read_csv("iris.csv") #Read the clean data file

print("\nOriginal data shape:", df_iris.shape) #dataframe dimensions(Nr of rows, Nr of columns)

# Introducing unclean data to the dataset, for the purpose for demonstrating data engineering
df_iris_unclean  = pd.read_csv("unclean_iris.csv") #Read the unclean data file
df_concatenated = pd.concat([df_iris , df_iris_unclean], ignore_index=False) #merge both files

print("\nData shape after adding unclean data:",df_concatenated.shape)
Original data shape: (150, 6)

Data shape after adding unclean data: (170, 6)
In [4]:
# Clean the data in numeric columns:
# 1. Data Type Errors - numeric columns have non-numeric data
# 2. Missing values in various columns

# Numeric columns to check
numeric_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
#print("DataFrame before cleaning:\n", df_concatenated)


# 1. Handle Data Type Errors (Convert to numeric, replace errors with NaN) # Convert specific numeric columns to numeric data types
errors_found = False
for col in numeric_cols:
    try:
        df_concatenated[col] = pd.to_numeric(df_concatenated[col])
    except ValueError as e:
        print(f"\nError converting '{col}' to numeric: {e}")
        print(f"Non-numeric values found in '{col}'. Replacing with NaN.")
        df_concatenated[col] = pd.to_numeric(df_concatenated[col], errors='coerce') #handle errors
        errors_found = True

if errors_found:
    print("\nMissing values after numeric conversion:\n", df_concatenated.isnull().sum())


# 2. Handle Missing Values (Remove rows with NaN) 
initial_missing_count = df_concatenated.isnull().sum().sum() # Check for missing values before removing rows
print("\nMissing values before removing rows:", initial_missing_count)
df_concatenated.dropna(inplace=True) # Remove rows with any missing values (NaN), inplace=True to modify the DataFrame directly.
#df_concatenated = df_concatenated.dropna() # dropna() removes rows with at least one NaN value.
#df = df_concatenated  # Assign the cleaned DataFrame to the 'df' variable, This ensures that the cleaned data is used in subsequent steps.



# 3. Verify that missing values have been removed.
if df_concatenated.isnull().sum().sum() == 0:
    if initial_missing_count == 0:
        print("\nNo missing values were present initially.")
    else:
        print("\nMissing values successfully removed.")
else:
    print("\nWarning: Missing values still present after removal.")
    print("\nRemaining missing values:\n", df_concatenated.isnull().sum())

print("\nData shape after removing NaN rows:", df_concatenated.shape)
Error converting 'PetalLengthCm' to numeric: Unable to parse string "error" at position 152
Non-numeric values found in 'PetalLengthCm'. Replacing with NaN.

Missing values after numeric conversion:
 Id               0
SepalLengthCm    0
SepalWidthCm     2
PetalLengthCm    1
PetalWidthCm     1
Species          0
dtype: int64

Missing values before removing rows: 4

Missing values successfully removed.

Data shape after removing NaN rows: (166, 6)
In [5]:
# Clean the data in str column:
# 1. Extra Whitespace in the 'Species' column (leading or trailing whitespace)
# 2. Inconsistent formatting in the 'Species' column (species names have variations in capitalization and spelling.)
# 3. Drop rows with empty strings or NaN values in 'Species'

# 1. Remove Whitespace in 'Species'
original_species = df_concatenated['Species'].astype(str)
cleaned_species = original_species.str.strip()  #Remove whitespace from each string element in the current column, fast vectorized string operation
whitespace_removed = (original_species != cleaned_species).sum()

print(f"\nWhitespace removed from 'Species' column: {whitespace_removed}")

df_concatenated['Species'] = cleaned_species



# 2. Correct Inconsistent Formatting in 'Species' 
print("\nUnique Species labels before formatting:\n", sorted(df_concatenated['Species'].unique()))

species_mapping = {
    'Iris-vericolour': 'Iris-versicolor',
    'Iris-VERSICOLOR': 'Iris-versicolor',
    'Iris-vers': 'Iris-versicolor',
    'Iris-SETOSA': 'Iris-setosa',
}

df_concatenated['Species'] = df_concatenated['Species'].replace(species_mapping)



# 3. Drop rows with empty strings or NaN values
df_concatenated = df_concatenated[df_concatenated['Species'].replace('', np.nan).notna()] # Use numpy.nan for consistency


# 4. Verify lable result
print("\nUnique Species labels after cleaning:\n", sorted(df_concatenated['Species'].unique()))
Whitespace removed from 'Species' column: 2

Unique Species labels before formatting:
 ['Iris-SETOSA', 'Iris-VERSICOLOR', 'Iris-setosa', 'Iris-vericolour', 'Iris-vers', 'Iris-versicolor', 'Iris-virginica']

Unique Species labels after cleaning:
 ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']
In [6]:
# Clean the data: Remove duplicates rows

# Get a list of columns excluding the index column
columns_to_check = [col for col in df_concatenated.columns if col != 'Id']

# Store the original number of rows
original_row_count = len(df_concatenated)

# Drop duplicate rows based on the other columns
df_no_duplicates = df_concatenated.drop_duplicates(subset=columns_to_check)  # drop_duplicates() checks all columns by default.

# Store the number of rows after duplicate removal
new_row_count = len(df_no_duplicates)

# Calculate the number of rows removed
rows_removed = original_row_count - new_row_count

print(f"Original number of rows: {original_row_count}")
print(f"Number of rows after duplicate removal: {new_row_count}")
print(f"Number of rows removed: {rows_removed}")
Original number of rows: 166
Number of rows after duplicate removal: 155
Number of rows removed: 11
In [7]:
# Feature Engineering for K-Nearest Neighbors (KNN)
# Label Encoding of 'Species' column
from sklearn.preprocessing import LabelEncoder

def encode_species(df):
    df_copy = df.copy()  # Create a copy within the function
    label_encoder = LabelEncoder()
    df_copy.loc[:, 'Species_encoded'] = label_encoder.fit_transform(df_copy['Species'])
    return df_copy  # Return the copy

# Example Usage (assuming df_no_duplicates is your DataFrame)
df_iris = encode_species(df_no_duplicates.copy())  # Create a copy before passing

# 4. Verify label result
# Print unique Species labels after encoding
print("\nUnique Species labels with encoded values:")

# Create a mapping dictionary for clarity
label_encoder = LabelEncoder()
label_encoder.fit(df_iris['Species'])
species_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Print unique labels and their encoded values
for species, encoded_value in species_mapping.items():
    print(f"'{species}': {encoded_value}")
Unique Species labels with encoded values:
'Iris-setosa': 0
'Iris-versicolor': 1
'Iris-virginica': 2
In [8]:
# Train and evaluate K-Nearest Neighbors(KNN) model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Separate features (X) and target (y)
X = df_iris.drop(['Species', 'Species_encoded'], axis=1)
y = df_iris['Species_encoded']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Engineering: Scale features with StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define the hyperparameter grid
param_grid = {
    'n_neighbors': range(1, 21),
    'metric': ['euclidean', 'manhattan', 'minkowski'],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]  # Only for Minkowski
}

# Create the KNN classifier
knn = KNeighborsClassifier()

# Perform grid search
grid_search = GridSearchCV(knn, param_grid, cv=5)  # 5-fold cross-validation
grid_search.fit(X_train_scaled, y_train)  # use scaled X_train

# Get the best hyperparameters
best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

# Evaluate the model with the best hyperparameters
best_knn = KNeighborsClassifier(**best_params)
best_knn.fit(X_train_scaled, y_train)  # use scaled X_train
y_pred = best_knn.predict(X_test_scaled)  # use scaled X_test

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)

print("\nAccuracy:", accuracy)
print("Precision:", precision_score(y_test, y_pred, average='weighted'))  # Weighted average for multi-class
print("Recall:", recall_score(y_test, y_pred, average='weighted'))  # Weighted average for multi-class
print("F1-Score:", f1_score(y_test, y_pred, average='weighted'))  # Weighted average for multi-class
print("\nClassification Report:\n", classification_report(y_test, y_pred))
Best Hyperparameters: {'metric': 'euclidean', 'n_neighbors': 3, 'p': 1, 'weights': 'uniform'}

Accuracy: 0.9375
Precision: 0.95
Recall: 0.9375
F1-Score: 0.9385683760683761

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      0.86      0.92         7
           2       0.80      1.00      0.89         4

    accuracy                           0.94        16
   macro avg       0.93      0.95      0.94        16
weighted avg       0.95      0.94      0.94        16

In [9]:
# Model data visualization
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix


# Plot Confusion Matrix for KNN-model
cm = confusion_matrix(y_test, y_pred)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_) #use the label encoder that was used to create the encoded species.
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()


# Plot Pair Plot for KNN-model
def create_pairplot(df):
    sns.pairplot(df, hue='Species', vars=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'])
    plt.show()

create_pairplot(df_iris)
No description has been provided for this image
No description has been provided for this image
In [ ]: